#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test scheduling of gres/gpu and gres/mps
############################################################################
# Copyright (C) 2018 SchedMD LLC
# Written by Morris Jette
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
############################################################################
source ./globals

# Generated scripts and job output files, all located under $test_dir.
set file_in1    "$test_dir/input1"
set file_in2    "$test_dir/input2"
set file_out1   "$test_dir/output1"
set file_out2   "$test_dir/output2"
set job_id      0
# IDs of the batch jobs submitted by this test; cleanup cancels them.
set job_list    {}
# gres/mps count a node must be able to satisfy. The individual tests
# request fractions of this value.
set mps_cnt     100

if {![check_config_select "cons_tres"]} {
	skip "This test is only compatible with select/cons_tres"
}

# Probe with $mps_cnt instead of repeating the literal so that the node
# requirement always matches the value the tests derive their requests from.
set node_with_mps [get_nodes_by_request "--gres=mps:$mps_cnt -t1"]
if {[llength $node_with_mps] != 1} {
	skip "This test requires being able to submit job with --gres=mps:$mps_cnt"
}

# Cancel every job recorded in the global job_list.
# NOTE(review): presumably invoked by the test harness sourced from
# ./globals when the test ends - confirm.
proc cleanup {} {
	cancel_job $::job_list
}

#
# Simple MPS request, check environment variables
#
log_info "==== TEST 1 ===="

# Step script: print the node name and the two CUDA variables, one per line.
make_bash_script $file_in1 "
echo HOST:\$SLURMD_NODENAME
echo CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES
echo CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
exit 0"

set timeout     $max_job_delay
set device      -1
set host        "unknown"
set match       0
set percentage  -1
# Request half of the probed gres/mps count.
set target_job  [expr {$mps_cnt / 2}]

# "match" counts how many of the three expected output lines were seen.
spawn $srun --gres=mps:$target_job -w $node_with_mps -n1 -t1 -J $test_name $file_in1
expect {
	-re "HOST:($re_word_str)" {
		set host $expect_out(1,string)
		incr match
		exp_continue
	}
	-re "CUDA_VISIBLE_DEVICES:($number)" {
		subtest {$expect_out(1,string) == 0} "Verify CUDA_VISIBLE_DEVICES == 0" "$expect_out(1,string) != 0"
		incr match
		exp_continue
	}
	-re "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
		incr match
		exp_continue
	}
	timeout {
		fail "srun not responding"
	}
	eof {
		wait
	}
}
subtest {$match == 3} "Verify expected output" "$match != 3"

#
# Run two steps in parallel to consume gres/mps using sbatch
#
log_info "==== TEST 2 ===="
# The job gets half of the probed gres/mps count, each parallel step half
# of the job's share.
set target_job  [expr {$mps_cnt / 2}]
set target_step [expr {$target_job / 2}]

# Step script: report node and CUDA environment on one line, then linger
# so that steps started together really overlap.
make_bash_script $file_in1 "
echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
$bin_sleep 5
exit 0"
# Batch script: one step using the whole job allocation, then two
# concurrent steps using half of it each.
make_bash_script $file_in2 "
$srun --overlap --gres=mps:$target_job $file_in1 &
wait
$bin_date
$srun --overlap --gres=mps:$target_step $file_in1 &
$srun --overlap --gres=mps:$target_step $file_in1 &
wait
$bin_date
exit 0"
exec $bin_rm -f $file_out1
set job_id 0
spawn $sbatch --gres=mps:$target_job -w $node_with_mps -n1 -t1 -o $file_out1 -J $test_name $file_in2
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		lappend job_list $job_id
		exp_continue
	}
	timeout {
		fail "sbatch not responding"
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	fail "Job not submitted"
}
wait_for_job -fail $job_id "DONE"
wait_for_file -fail $file_out1

# Parse the three step reports: the first match is the full-allocation
# step, the second and third are the two half-allocation steps.
set match 0
spawn $bin_cat $file_out1
expect {
	-re "CUDA_VISIBLE_DEVICES:($number) CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
		subtest {$expect_out(1,string) == 0} "Verify CUDA_VISIBLE_DEVICES == 0" "$expect_out(1,string) != 0"
		incr match
		if {$match == 1} {
			# Percentage seen by the step that used the whole job allocation
			set perc $expect_out(2,string)
		} elseif {$match == 2} {
			# Percentage seen by the first half-allocation step
			set perc_sum $expect_out(2,string)
		} elseif {[subtest {$expect_out(2,string) == $perc_sum} "Verify CUDA percentage == $perc_sum" "$expect_out(2,string) != $perc_sum"]} {
			# Both halves together must add up to the full-step percentage,
			# allowing a difference of 1. The diagnostic now shows the sum,
			# matching what is actually being checked.
			subtest {[expr $perc - $perc_sum - $expect_out(2,string)] <= 1} "Verify CUDA percentage" "$perc != $perc_sum + $expect_out(2,string)"
		}
		exp_continue
	}
	eof {
		wait
	}
}
subtest {$match == 3} "Verify CUDA information about job" "$match != 3"

#
# Run two steps in parallel to consume gres/mps using salloc
# Reuse scripts from test 2 above
#
log_info "==== TEST 3 ===="
# Same scenario as TEST 2, but the batch script runs under salloc so the
# step output arrives directly on the spawned process instead of a file.
set match 0
spawn $salloc --gres=mps:$target_job -w $node_with_mps -n1 -t1 -J $test_name $file_in2
expect {
	-re "CUDA_VISIBLE_DEVICES:($number) CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
		subtest {$expect_out(1,string) == 0} "Verify CUDA_VISIBLE_DEVICES == 0" "$expect_out(1,string) != 0"
		incr match
		if {$match == 1} {
			# Percentage seen by the step that used the whole job allocation
			set perc $expect_out(2,string)
		} elseif {$match == 2} {
			# Percentage seen by the first half-allocation step
			set perc_sum $expect_out(2,string)
		} elseif {[subtest {$expect_out(2,string) == $perc_sum} "Verify CUDA percentage == $perc_sum" "$expect_out(2,string) != $perc_sum"]} {
			# Both halves together must add up to the full-step percentage,
			# allowing a difference of 1. The diagnostic now shows the sum,
			# matching what is actually being checked.
			subtest {[expr $perc - $perc_sum - $expect_out(2,string)] <= 1} "Verify CUDA percentage" "$perc != $perc_sum + $expect_out(2,string)"
		}
		exp_continue
	}
	timeout {
		fail "salloc not responding"
	}
	eof {
		wait
	}
}
subtest {$match == 3} "Verify CUDA information about job" "$match != 3"

#
# Run three steps in parallel to make sure steps get delayed as needed to avoid
# oversubscribing consumed MPS resources
#
log_info "==== TEST 4 ===="
set target_job  [expr {$mps_cnt / 2}]
set target_step [expr {$target_job / 2}]

# Step script: report node and CUDA environment on one line, then linger.
make_bash_script $file_in1 "
echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
$bin_sleep 5
exit 0"
# Batch script: start three steps at once, each asking for half of the
# job's gres/mps, so they cannot all hold their request simultaneously.
make_bash_script $file_in2 "
$srun --exact -n1 --gres=mps:$target_step $file_in1 &
$srun --exact -n1 --gres=mps:$target_step $file_in1 &
$srun --exact -n1 --gres=mps:$target_step $file_in1 &
wait
$bin_date
exit 0"

set job_id 0
exec $bin_rm -f $file_out1
spawn $sbatch --gres=mps:$target_job -w $node_with_mps -n3 -t1 -o $file_out1 -J $test_name $file_in2
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		lappend job_list $job_id
		exp_continue
	}
	timeout {
		fail "sbatch not responding"
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	fail "Job not submitted"
}
wait_for_job -fail $job_id "DONE"
wait_for_file -fail $file_out1

# First pass over the output: every step must report device 0 and the
# same MPS percentage as the first step seen.
set match 0
spawn $bin_cat $file_out1
expect {
	-re "CUDA_VISIBLE_DEVICES:($number) CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
		subtest {$expect_out(1,string) == 0} "Verify CUDA_VISIBLE_DEVICES == 0" "$expect_out(1,string) != 0"
		incr match
		if {$match != 1} {
			subtest {$expect_out(2,string) == $perc} "Verify MPS percentage on step" "$expect_out(2,string) != $perc"
		} else {
			set perc $expect_out(2,string)
		}
		exp_continue
	}
	eof {
		wait
	}
}
subtest {$match == 3} "Verify CUDA information about job" "$match != 3"

# Second pass, with terminal echo suppressed: count how many times a step
# reported that its creation was temporarily disabled. Exactly one is
# expected.
set match 0
log_user 0
spawn $bin_cat $file_out1
expect {
	-re "step creation temporarily disabled" {
		incr match
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1
subtest {$match == 1} "Verify delay step for sufficient MPS resources" "$match != 1"

#
# Run a step that tries to consume more gres/mps than allocated to the job
#
log_info "==== TEST 5 ===="
# The step asks for one unit more than the job itself was granted.
set target_job  [expr {$mps_cnt / 2}]
set target_step [expr {$target_job + 1}]

# Step script: would report node and CUDA environment if it ever ran.
make_bash_script $file_in1 "
echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
$bin_sleep 5
exit 0"
# Batch script: a single step with the oversized request.
make_bash_script $file_in2 "
$srun --overlap --gres=mps:$target_step $file_in1
exit 0"

set job_id 0
exec $bin_rm -f $file_out1
spawn $sbatch --gres=mps:$target_job -w $node_with_mps -n1 -t1 -o $file_out1 -J $test_name $file_in2
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		lappend job_list $job_id
		exp_continue
	}
	timeout {
		fail "sbatch not responding"
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	fail "Job not submitted"
}
wait_for_job -fail $job_id "DONE"
wait_for_file -fail $file_out1

# The job output must contain the rejection message and no step report.
set output [run_command_output -fail "$bin_cat $file_out1"]
subtest {![regexp "CUDA_VISIBLE_DEVICES" $output]} "Verify step didn't run"
subtest {[regexp "Unable to create step" $output]} "Verify step rejection message"

#
# Run multi-node job
#
log_info "==== TEST 6 ===="
# Use at most two of the nodes reported by get_mps_node_count.
set node_cnt [get_mps_node_count]
if {$node_cnt > 2} {set node_cnt 2}
set target_job  [expr {$mps_cnt / 2}]

# Step script: report node and CUDA environment on one line, then linger.
make_bash_script $file_in1 "
echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
$bin_sleep 5
exit 0"

set hostname "NO_VAL"
set match 0
set subtest6_nodes [get_nodes_by_request "--gres=mps:$target_job -N$node_cnt -t1"]
if {[llength $subtest6_nodes] == $node_cnt} {
	# "match" counts HOST lines; the first host name is remembered so later
	# lines can be checked against it.
	spawn $srun --gres=mps:$target_job -N$node_cnt -w[join $subtest6_nodes ","] -t1 -J $test_name $file_in1
	expect {
		-re "HOST:($re_word_str)" {
			incr match
			if {$match == 1} {
				set hostname $expect_out(1,string)
			} else {
				if {[get_config_param "FrontendName"] ne "MISSING"} {
					log_debug "Duplicate SLURMD_HOSTNAME in front-end mode as expected"
				} else {
					subtest {$expect_out(1,string) != $hostname} "Verify that only 1 task run on same node ($hostname)"
				}
			}
			exp_continue
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}
	subtest {$match == $node_cnt} "Verify getting data from multiple nodes" "$match != $node_cnt"
} else {
	subskip "Requires being able to submit job with --gres=mps:$target_job -N$node_cnt"
}

#
# Make sure that gres/gpu and gres/mps jobs either do not share the same GPU
# or run at different times
#
if {[get_config_param "FrontendName"] ne "MISSING"} {
	subskip "Doesn't work with front_end"
} else {
	log_info "==== TEST 7 ===="
	set target_job [expr $mps_cnt / 2]
	# Job 1 script (run with gres/gpu): report its environment and
	# allocation, submit job 2 (gres/mps) onto the same node, then stay
	# alive for a while so the two jobs have a chance to overlap.
	make_bash_script $file_in1 "
	echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
	$scontrol -dd show job \$SLURM_JOB_ID
	$sbatch --gres=mps:$target_job -w \$SLURMD_NODENAME -n1 -t1 -o $file_out2 -J $test_name $file_in2
	sleep 30
	exit 0"

	# Job 2 script: report its environment and allocation, then list the
	# running jobs of this test so we can tell whether job 1 is still active.
	make_bash_script $file_in2 "
	echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
	$scontrol -dd show job \$SLURM_JOB_ID
	$squeue --name=$test_name --noheader --state=r --format=\"jobid=%i state=%T\"
	exit 0"

	exec $bin_rm -f $file_out1 $file_out2
	set job_id1 0
	spawn $sbatch --gres=gpu:1 -w $node_with_mps -n1 -t1 -o $file_out1 -J $test_name $file_in1
	expect {
		-re "Submitted batch job ($number)" {
			set job_id1 $expect_out(1,string)
			lappend job_list $job_id1
			exp_continue
		}
		timeout {
			fail "sbatch not responding"
		}
		eof {
			wait
		}
	}
	# Same guard as the other tests: do not wait on a job that was never
	# submitted.
	if {$job_id1 == 0} {
		fail "Job not submitted"
	}
	# NOTE(review): fixed delay kept from the original; presumably gives
	# job 1 time to start before its state is polled - confirm.
	sleep 30
	wait_for_job -fail $job_id1 "DONE"
	wait_for_file -fail $file_out1

	# Job 1 output: collect the GPU index it was allocated and the ID of
	# the job it submitted.
	set dev1 -1
	set job_id2 0
	set match 0
	spawn $bin_cat $file_out1
	expect {
		-re "CUDA_VISIBLE_DEVICES:($number)" {
			incr match
			exp_continue
		}
		-re "GRES=gpu:.*\\(IDX:($number)\\)" {
			set dev1 $expect_out(1,string)
			exp_continue
		}
		-re "Submitted batch job ($number)" {
			set job_id2 $expect_out(1,string)
			lappend job_list $job_id2
			exp_continue
		}
		eof {
			wait
		}
	}
	subtest {$match != 0} "Verify CUDA_VISIBLE_DEVICES is set"
	subtest {$dev1 != -1} "Verify GPU device index is found"
	subtest {$job_id2 != 0} "Verify second job is correctly submitted"

	wait_for_job -fail $job_id2 "DONE"
	wait_for_file -fail $file_out2

	# Job 2 output: collect its device index and whether job 1 was still
	# running while job 2 executed.
	set dev2 -1
	set match 0
	set running 0
	spawn $bin_cat $file_out2
	expect {
		-re "CUDA_VISIBLE_DEVICES:($number)" {
			incr match
			if {$match == 1} {
				set dev2 $expect_out(1,string)
			}
			exp_continue
		}
		-re "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
			incr match
			exp_continue
		}
		-re "GRES=mps:.*\\(IDX:($number)\\)" {
			# The original had a stray brace pair here that ended this action
			# early and stopped parsing before the squeue line was reached.
			# The index also belongs to job 2, so it is stored in dev2 and
			# dev1 keeps job 1's GPU index for the comparison below.
			set dev2 $expect_out(1,string)
			exp_continue
		}
		-re "jobid=$job_id1 state=RUNNING" {
			set running 1
			exp_continue
		}
		eof {
			wait
		}
	}
	subtest {$match == 2} "Verify CUDA_VISIBLE_DEVICES and CUDA_MPS_ACTIVE_THREAD_PERCENTAGE are set"
	if {[subtest {$dev2 != -1} "Verify GPU device index is found"]} {
		if {$running != 0} {
			# Jobs overlapped in time, so they must be on different devices.
			subtest {$dev1 != $dev2} "Verify the jobs are NOT using the same GPU and running at the same time"
		} elseif {$dev1 == $dev2} {
			log_debug "The jobs are using the same GPU ($dev1) and running at different times, which is fine"
		} else {
			log_debug "The jobs are using different GPUs ($dev1 != $dev2), which is fine"
		}
	}
}
